3. Nettoyage
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
import seaborn as sns
import missingno as msno
import clean #mon module de fontions qui me permettent de nettoyer un dataframe
import plot_
import barcodenumber
# Filter warnings
import warnings
warnings.filterwarnings('ignore')
Je sélectionne les colonnes qui m'intéressent en fonction de mon idée d'application :
- données d'entrée : le code bar du produit,
- données de sorties : score et recommandation adressé au population diabétique (pour éviter l'hyperglycémie).
Les variables qui m'intéressent :
df = pd.read_csv('df_export.csv',
dtype={'nova_group': 'category',
'nutriscore_grade': 'category',
'code':'object',
'pnns_groups_1':'category'
})
df.head()
| code | url | product_name | brands | countries_en | pnns_groups_1 | product_quantity | ingredients_text | sugars_100g | saturated-fat_100g | fiber_100g | salt_100g | additives_n | energy-kcal_100g | carbohydrates_100g | proteins_100g | nova_group | nutriscore_grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00000000000000225 | http://world-en.openfoodfacts.org/product/0000... | jeunes pousses | endives | France | unknown | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0000000000000207025004 | http://world-en.openfoodfacts.org/product/0000... | Andrè | NaN | Germany | unknown | NaN | NaN | 12.6 | 2.00 | 3.0 | NaN | NaN | 165.0 | 65.0 | 1.5 | NaN | NaN |
| 2 | 00000000000003429145 | http://world-en.openfoodfacts.org/product/0000... | L.casei | NaN | Spain | unknown | NaN | Leche semidesnatada, azucar 6.9%, leche desnat... | 9.8 | 0.90 | NaN | 0.10 | 0.0 | NaN | 9.8 | 2.7 | NaN | NaN |
| 3 | 00000000000026772226 | http://world-en.openfoodfacts.org/product/0000... | Skyr | Danone | France | Milk and dairy products | 480.0 | NaN | 3.9 | 0.10 | NaN | 0.09 | NaN | 57.0 | 3.9 | 10.0 | NaN | a |
| 4 | 0000000000017 | http://world-en.openfoodfacts.org/product/0000... | Vitória crackers | NaN | France | unknown | NaN | NaN | 15.0 | 3.08 | NaN | 1.40 | NaN | 375.0 | 70.1 | 7.8 | NaN | NaN |
clean.vm(df, 0.001,'P3_21a.png')
code 0.000000 url 0.000000 product_name 0.035049 brands 0.509620 countries_en 0.002706 pnns_groups_1 0.000508 product_quantity 0.758517 ingredients_text 0.702619 sugars_100g 0.218824 saturated-fat_100g 0.228532 fiber_100g 0.661975 salt_100g 0.293519 additives_n 0.702618 energy-kcal_100g 0.225170 carbohydrates_100g 0.212735 proteins_100g 0.212000 nova_group 0.729291 nutriscore_grade 0.682441 dtype: float64
clean.info(df)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2718738 entries, 0 to 2718737
Data columns (total 18 columns):
# Column Dtype
--- ------ -----
0 code object
1 url object
2 product_name object
3 brands object
4 countries_en object
5 pnns_groups_1 category
6 product_quantity float64
7 ingredients_text object
8 sugars_100g float64
9 saturated-fat_100g float64
10 fiber_100g float64
11 salt_100g float64
12 additives_n float64
13 energy-kcal_100g float64
14 carbohydrates_100g float64
15 proteins_100g float64
16 nova_group category
17 nutriscore_grade category
dtypes: category(3), float64(9), object(6)
memory usage: 318.9+ MB
None
product_quantity sugars_100g saturated-fat_100g fiber_100g \
count 6.565300e+05 2.123812e+06 2.097419e+06 9.190010e+05
mean 1.783632e+13 4.708846e+07 4.767830e+05 1.067820e+41
std 1.123689e+16 6.861862e+10 6.904900e+08 1.023661e+44
min 0.000000e+00 -1.700000e+01 0.000000e+00 -6.700000e+00
25% 1.500000e+02 5.000000e-01 1.000000e-01 0.000000e+00
50% 2.834952e+02 3.530000e+00 1.785714e+00 1.600000e+00
75% 5.000000e+02 1.700000e+01 7.000000e+00 3.800000e+00
max 9.000000e+18 1.000000e+14 1.000000e+12 9.813275e+46
salt_100g additives_n energy-kcal_100g carbohydrates_100g \
count 1.920736e+06 808503.000000 2.106561e+06 2.140367e+06
mean 5.206338e+73 1.995217 2.490905e+39 3.252318e+03
std 7.215496e+76 2.861190 3.615298e+42 4.716338e+06
min 0.000000e+00 0.000000 -5.300000e+01 -1.000000e+00
25% 1.000000e-01 0.000000 1.000000e+02 3.570000e+00
50% 6.000000e-01 1.000000 2.590000e+02 1.577000e+01
75% 1.400000e+00 3.000000 4.000000e+02 5.300000e+01
max 1.000000e+80 50.000000 5.247243e+45 6.900000e+09
proteins_100g
count 2.142366e+06
mean 2.550214e+68
std 3.732703e+71
min -4.600000e+01
25% 1.300000e+00
50% 6.000000e+00
75% 1.250000e+01
max 5.463492e+74
Je regroupe les catégories PNNS_GROUPS_1 en fonction des groupes alimentaires identifiés par Santé Publique France.
df['pnns_groups_1'].unique()
['unknown', 'Milk and dairy products', 'Fat and sauces', 'Beverages', 'Sugary snacks', ..., 'Cereals and potatoes', 'Salty snacks', 'Alcoholic beverages', NaN, 'sugary-snacks'] Length: 13 Categories (12, object): ['Alcoholic beverages', 'Beverages', 'Cereals and potatoes', 'Composite foods', ..., 'Salty snacks', 'Sugary snacks', 'unknown', 'sugary-snacks']
mappings = {'Milk and dairy products' : 'Produits Laitiers',
'Fat and sauces' : 'Matières grasses et sauces',
'Beverages':'Boissons',
'Sugary snacks' : 'Sucreries',
'sugary-snacks':'Sucreries',
'Alcoholic beverages':'Sucreries',
'unknown': np.nan,
'Composite foods': 'Plats cuisinés',
'Fruits and vegetables': 'Fruits et légumes',
'Fish Meat Eggs': 'Viandes',
'Cereals and potatoes': 'Céréales et pomme de terre',
'Salty snacks':'Collation salée'}
df['pnns_groups_1'] = df['pnns_groups_1'].replace(mappings)
df['pnns_groups_1'].describe()
count 908222 unique 9 top Sucreries freq 212319 Name: pnns_groups_1, dtype: object
fig, ax = plot_.countplot(df,'pnns_groups_1','P3_22a.png',45,'non')
fig, ax = plot_.countplot(df,'nova_group','P3_22b.png',0,'oui')
ax.set_xlabel('Groupe NOVA')
Text(0.5, 60.20833333333333, 'Groupe NOVA')
fig, ax = plot_.countplot(df,'nutriscore_grade','P3_22c.png',0,'oui')
Utilisation de la fonction homostr de mon module de nettoyage. Pour chaque variable de type object, elle transforme tous les caractères en lettres capitales et supprime les espaces en amont et à l'aval.
df = clean.homostr(df)
variable CODE variable URL variable PRODUCT_NAME variable BRANDS variable COUNTRIES_EN variable INGREDIENTS_TEXT
#Vérfication
#clean.supp_esp(df,'BRANDS')
#OK
##Sélection des produits vendus en France
df['COUNTRIES_EN'] = df['COUNTRIES_EN'].str.upper()
df = df[df['COUNTRIES_EN'].str.contains('FRANCE')]
## Suppression de la colonne Countries
df = df.drop(columns = 'COUNTRIES_EN')
Pour les variables qualitatives CODE, URL, PRODUCT_NAME, BRANDS, et INGREDIENTS_TEXT, l'analyse univarié n'est pas pertinente. Sans considérer d'éventuels doublons, chaque individu à un CODE unique, un URL unique, un PRODUCT_NAME etc...
Les URL : Aucun des url testés ne fonctionnent, je remplace le 'WORLD-EN.OPENFOODFACTS.ORG/PRODUCT' par 'world.openfoodfacts.org/product' (suppression du '-en')
url_ok = 'https://world.openfoodfacts.org/product/'
print(url_ok,len(url_ok))
url_nok = 'HTTP://WORLD-EN.OPENFOODFACTS.ORG/PRODUCT/'
print(url_nok,len(url_nok))
https://world.openfoodfacts.org/product/ 40 HTTP://WORLD-EN.OPENFOODFACTS.ORG/PRODUCT/ 42
print('Les URL commençant par HTTP://WORLD-EN.OPENFOODFACTS.ORG/PRODUCT/, représentent',
(df[df['URL'].str.startswith(url_nok)].shape[0]/df.shape[0])*100,
' % des produits du dataframe')
Les URL commençant par HTTP://WORLD-EN.OPENFOODFACTS.ORG/PRODUCT/, représentent 100.0 % des produits du dataframe
Tous les URL commencent par 'HTTP://WORLD-EN.OPENFOODFACTS.ORG/PRODUCT/' :
url_ok = 'https://world.openfoodfacts.org/product/'
for i, values in df['URL'].items():
ref = values[42:]
df.loc[i,'URL'] = url_ok+ref
Certains URLs ne comprennent pas de code, c'est URL sont erronés ou mène à des produits sans désignation (i.e sans PRODUCT_NAME) dans le BDD.
df_notEAN = df[['URL']]
ind= []
for i,row in df_notEAN.iterrows():
df_tamp = row['URL']
#print('individu',i)
df_tamp_b = df_tamp[41:]
if not ('/' in df_tamp_b):
#print('Pas de nom de produit, voir variable ind')
ind.append(i)
df.loc[ind].PRODUCT_NAME.unique()
array(['NAN', '.', ')', ',', '?', '??', '???', '..', '??????????????',
'####*'], dtype=object)
df = df.drop(ind)
Les codes barres correspondent aux nomenclature EAN13 ou à celles propres aux magasins de vente. Je crée la variable 'LEN_CODE' qui à chaque individu associe la longueur de son code.
df[df['CODE'].str.contains('\W')]
#Aucun des codes ne contient de caractères spéciaux
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE |
|---|
df['LEN_CODE'] = df['CODE'].str.len()
fig, ax = plt.subplots(figsize = (12,7),dpi=150)
sns.set()
sns.countplot(df['LEN_CODE'])
#plt.setp(ax.get_xticklabels(), rotation=rot, ha="right",rotation_mode="anchor")
ax.set_xlabel('Nombre de chiffre',labelpad = 15, fontsize = 15)
ax.set_ylabel('Effectif',labelpad = 15, fontsize = 15)
ax.set_yscale('log')
plt.savefig('P3_31b.png',dpi = 150,bbox_inches = 'tight')
df['LEN_CODE'].describe()
count 931937.000000 mean 12.671474 std 1.350861 min 1.000000 25% 13.000000 50% 13.000000 75% 13.000000 max 45.000000 Name: LEN_CODE, dtype: float64
#Forme de la distribution
print('gamma1 = ',df['LEN_CODE'].skew(),'Facteur d\'asymétrie')
print('gamma2 = ',df['LEN_CODE'].kurtosis(),'Facteur d\' aplatissement')
gamma1 = -1.9713444534697477 Facteur d'asymétrie gamma2 = 16.64726461007067 Facteur d' aplatissement
barcodenumber.barcodes()
barcodenumber.check_code('EAN13', '0011110011657')
True
Le mode de la longueur du code est 13 et correspond bien à la norme EAN13. J'ai tenté de vérifier la validité des codes barres à l'aide du module barcodenumber. Le module ne fonctionne pas convenablement et requiert que la norme du code barre soit fixée. Dans notre BDD aucune norme n'est fixée. Un code à 12 chiffre peut être un UPC-A ou un code à 13 chiffre EAN13 erroné. Dans le cadre de ce projet il m'a été conseillé de ne pas approfondir d'avantage.
Certains codes ont une longueur abbérante.
Exemples ci-dessous :
#df[df['LEN_CODE']==1]
#df[df['LEN_CODE']==45]
#Suppression des produits aux codes abbérants
df = df[~(df['LEN_CODE']==1)]
df= df[~(df['LEN_CODE']==45)]
Je choisie de sélectionner un échantillon d'individu en utilisant l'écart-type de la variable LEN_CODE, et ainsi de réduire la probabilité d'avoir des codes abbérants ou erronés.
#Utilisation de l'écart type pour récupérer un échantillon
sigma3 = 3*df['LEN_CODE'].describe().loc['std'] #3 écart type.
#Écart-type petit, la distribution est resserrée.
#Intervalle de entre a et b
mu = df['LEN_CODE'].describe().loc['mean']
a = mu + sigma3
b = mu - sigma3
a,b
(16.722187612540942, 8.62079020129097)
df[(df['LEN_CODE'] <14)& (df['LEN_CODE'] >7)]
#https://world.openfoodfacts.org/product/004223000000011274140720
print('Cette échantillon représente',
(round(df[(df['LEN_CODE'] <14)& (df['LEN_CODE'] >7)].shape[0]/df.shape[0],2)*100),
'% du dataset')
Cette échantillon représente 99.0 % du dataset
df = df[(df['LEN_CODE'] <14)& (df['LEN_CODE'] >7)]
df['LEN_CODE']= df['CODE'].str.len()
df['LEN_CODE'].describe()
count 918781.000000 mean 12.680632 std 1.123535 min 8.000000 25% 13.000000 50% 13.000000 75% 13.000000 max 13.000000 Name: LEN_CODE, dtype: float64
df = df.drop(columns = 'LEN_CODE')
En appliquant la même stratégie que pour la variable CODE, je crée une variable LEN_PRODUCT, la longueur du PRODUCT_NAME de chaque individu, pour repérer les valeurs abbérantes.
df['LEN_PRODUCT']= df['PRODUCT_NAME'].str.len()
df['LEN_PRODUCT'].describe()
count 918781.000000 mean 21.793829 std 11.827348 min 0.000000 25% 14.000000 50% 19.000000 75% 28.000000 max 279.000000 Name: LEN_PRODUCT, dtype: float64
sns.set()
sns.boxplot(df['LEN_PRODUCT'])
<AxesSubplot:xlabel='LEN_PRODUCT'>
df[df['LEN_PRODUCT']== df['LEN_PRODUCT'].max()]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1169288 | 3289476000005 | https://world.openfoodfacts.org/product/328947... | MÉLANGE DE LÉGUMES (COURGETTE GRILLÉE, POIVRON... | LA COMPAGNIE ARTIQUE | Plats cuisinés | 380.0 | COURGETTE GRILLÉE* 27,8%, TOMATE CONCASSÉE* 24... | 4.2 | 1.5 | 2.1 | 0.28 | 0.0 | 78.0 | 5.2 | 3.6 | 3 | a | 279 |
#Les valeurs abberantes
df[df['LEN_PRODUCT']<3]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4072 | 00052337 | https://world.openfoodfacts.org/product/000523... | M&S | NaN | 120.0 | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | |
| 38357 | 16165003060 | https://world.openfoodfacts.org/product/001616... | FE | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 |
| 317996 | 75707098155 | https://world.openfoodfacts.org/product/007570... | H | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
| 408620 | 0201987013124 | https://world.openfoodfacts.org/product/020198... | V6 | NAN | NaN | NaN | YY | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | 2 |
| 417662 | 0205660039236 | https://world.openfoodfacts.org/product/020566... | L | L | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2625087 | 8803560000143 | https://world.openfoodfacts.org/product/880356... | 쌀떡 | 송학 | Produits Laitiers | 500.0 | NAN | 2.0 | 0.0 | 54.0 | 0.1778 | NaN | 235.0 | NaN | 3.0 | NaN | a | 2 |
| 2635733 | 8859090475050 | https://world.openfoodfacts.org/product/885909... | G | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
| 2708322 | 9639864340 | https://world.openfoodfacts.org/product/963986... | 肉 | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
| 2709965 | 9782070449941 | https://world.openfoodfacts.org/product/978207... | LE | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 |
| 2713396 | 99912345678 | https://world.openfoodfacts.org/product/999123... | EF | NAN | NaN | NaN | ILS | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | 2 |
89 rows × 18 columns
df[df['LEN_PRODUCT']< 3].PRODUCT_NAME.unique()
array(['', 'FE', 'H', 'V6', 'L', 'BN', 'P', 'AV', 'LT', 'LR', '01', '02',
'03', '04', '05', '06', '08', '09', '10', '11', '12', '13', '14',
'15', '16', '17', '18', '19', '20', '21', '22', 'BI', 'UP', 'SP',
'IL', 'K', 'CC', 'MG', 'U', '1', 'G', '?', 'O', 'S', 'HF', 'M’',
'SA', '📖', 'JE', 'BO', 'OE', 'DR', 'BJ', '8', 'BB', 'PQ', '曾粉',
'TF', '鈣片', 'M', 'OR', 'ÆG', 'HG', 'OK', '冰露', '2', 'HA', 'يب',
'FF', 'LU', '86', '쌀떡', '肉', 'LE', 'EF'], dtype=object)
#Suppression
df =df[~(df['LEN_PRODUCT']<3)]
df[df['PRODUCT_NAME'].str.contains('\W', regex=True)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 0000000000017 | https://world.openfoodfacts.org/product/000000... | VITÓRIA CRACKERS | NAN | NaN | NaN | NAN | 15.0 | 3.08 | NaN | 1.40 | NaN | 375.0 | 70.1 | 7.8 | NaN | NaN | 16 |
| 11 | 0000000000100 | https://world.openfoodfacts.org/product/000000... | MOUTARDE AU MOÛT DE RAISIN | COURTE PAILLE | Matières grasses et sauces | 100.0 | EAU GRAINES DE TÉGUMENTS DE MOUTARDE VINAIGRE ... | 22.0 | 2.20 | 0.0 | 4.60 | 0.0 | NaN | 29.0 | 5.1 | NaN | d | 26 |
| 12 | 0000000000123 | https://world.openfoodfacts.org/product/000000... | SAUCE SWEETY CHILI 0% | NAN | NaN | NaN | NAN | 0.4 | 0.00 | NaN | 2.04 | NaN | 21.0 | 4.8 | 0.2 | NaN | NaN | 21 |
| 13 | 0000000000178 | https://world.openfoodfacts.org/product/000000... | MINI COCO | NAN | NaN | NaN | NAN | 3.0 | 1.00 | NaN | 1.15 | NaN | 60.0 | 10.0 | 2.0 | NaN | NaN | 9 |
| 14 | 0000000000208 | https://world.openfoodfacts.org/product/000000... | PISTOU D'AIL DES OURS | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 21 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2718718 | 9999992756112 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ | NAN | NaN | 1000.0 | NAN | 0.0 | 6.20 | NaN | 0.19 | NaN | 196.0 | 0.0 | 19.0 | NaN | NaN | 11 |
| 2718725 | 9999999004360 | https://world.openfoodfacts.org/product/999999... | MINIS BEIGNETS | NAN | NaN | NaN | NAN | 10.4 | 6.00 | NaN | 1.10 | NaN | 333.0 | 30.9 | 7.4 | NaN | d | 14 |
| 2718729 | 99999995 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ 5% | NAN | Viandes | NaN | VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE DE... | 0.0 | 2.30 | NaN | 0.18 | 0.0 | 131.0 | 0.0 | 21.5 | NaN | a | 14 |
| 2718733 | 999999999 | https://world.openfoodfacts.org/product/999999... | THÉ NOIR BIO DARJEELING | PAGÈS | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 23 |
| 2718736 | 9999999999994 | https://world.openfoodfacts.org/product/999999... | LIGHT & FREE SKYR A BOIRE | NAN | Produits Laitiers | NaN | NAN | 49.0 | 13.00 | 6.0 | 3.00 | NaN | 24.0 | 70.0 | 2.0 | NaN | e | 25 |
811787 rows × 18 columns
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str.replace('\W',' ', regex=True)
pattern = r'[^a-zA-Z]'
df[df['PRODUCT_NAME'].str.contains(pattern, regex=True)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 0000000000017 | https://world.openfoodfacts.org/product/000000... | VITÓRIA CRACKERS | NAN | NaN | NaN | NAN | 15.0 | 3.08 | NaN | 1.40 | NaN | 375.0 | 70.1 | 7.8 | NaN | NaN | 16 |
| 11 | 0000000000100 | https://world.openfoodfacts.org/product/000000... | MOUTARDE AU MOÛT DE RAISIN | COURTE PAILLE | Matières grasses et sauces | 100.0 | EAU GRAINES DE TÉGUMENTS DE MOUTARDE VINAIGRE ... | 22.0 | 2.20 | 0.0 | 4.60 | 0.0 | NaN | 29.0 | 5.1 | NaN | d | 26 |
| 12 | 0000000000123 | https://world.openfoodfacts.org/product/000000... | SAUCE SWEETY CHILI 0 | NAN | NaN | NaN | NAN | 0.4 | 0.00 | NaN | 2.04 | NaN | 21.0 | 4.8 | 0.2 | NaN | NaN | 21 |
| 13 | 0000000000178 | https://world.openfoodfacts.org/product/000000... | MINI COCO | NAN | NaN | NaN | NAN | 3.0 | 1.00 | NaN | 1.15 | NaN | 60.0 | 10.0 | 2.0 | NaN | NaN | 9 |
| 14 | 0000000000208 | https://world.openfoodfacts.org/product/000000... | PISTOU D AIL DES OURS | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 21 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2718718 | 9999992756112 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ | NAN | NaN | 1000.0 | NAN | 0.0 | 6.20 | NaN | 0.19 | NaN | 196.0 | 0.0 | 19.0 | NaN | NaN | 11 |
| 2718725 | 9999999004360 | https://world.openfoodfacts.org/product/999999... | MINIS BEIGNETS | NAN | NaN | NaN | NAN | 10.4 | 6.00 | NaN | 1.10 | NaN | 333.0 | 30.9 | 7.4 | NaN | d | 14 |
| 2718729 | 99999995 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ 5 | NAN | Viandes | NaN | VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE DE... | 0.0 | 2.30 | NaN | 0.18 | 0.0 | 131.0 | 0.0 | 21.5 | NaN | a | 14 |
| 2718733 | 999999999 | https://world.openfoodfacts.org/product/999999... | THÉ NOIR BIO DARJEELING | PAGÈS | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 23 |
| 2718736 | 9999999999994 | https://world.openfoodfacts.org/product/999999... | LIGHT FREE SKYR A BOIRE | NAN | Produits Laitiers | NaN | NAN | 49.0 | 13.00 | 6.0 | 3.00 | NaN | 24.0 | 70.0 | 2.0 | NaN | e | 25 |
820176 rows × 18 columns
pattern = [r'\d+\s',r'^\d\d\d+']
df[df['PRODUCT_NAME'].str.contains(pattern[0], regex=True)]
#Le pattern[0] : les individus dont le PRODUCT_NAME commence par une série de chiffre
#Le pattern[1] : les individus dont le PRODUCT_NAME est une série de chiffre
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 0000000000123 | https://world.openfoodfacts.org/product/000000... | SAUCE SWEETY CHILI 0 | NAN | NaN | NaN | NAN | 0.4 | 0.0 | NaN | 2.04 | NaN | 21.0 | 4.80 | 0.20 | NaN | NaN | 21 |
| 116 | 0000000014120 | https://world.openfoodfacts.org/product/000000... | SALADE FRAÎCHEUR 3 FROMAGES | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 27 |
| 125 | 0000000015707 | https://world.openfoodfacts.org/product/000000... | 8 BUCHETTES POULET CŒUR AUX FIGUES | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 34 |
| 193 | 000000002922 | https://world.openfoodfacts.org/product/000000... | CARANOA 55 | NAN | NaN | 1500.0 | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11 |
| 292 | 0000000039529 | https://world.openfoodfacts.org/product/000000... | PACK DE 2 TWIX | TWIX,LUNDBERG | Sucreries | NaN | NAN | NaN | NaN | 6.2 | NaN | NaN | 354.0 | 77.08 | 6.25 | NaN | NaN | 14 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2713143 | 9950014911001 | https://world.openfoodfacts.org/product/995001... | OIGNONS JAUNES 40 60 | FERME DE L'ARTOIS | Fruits et légumes | NaN | JAUNEO 4V OU POIDS NET : 2 KG CATL ORIG MM LOT... | 0.5 | 5.0 | 25.0 | 0.20 | 0.0 | 0.0 | 2.00 | 32.00 | NaN | a | 20 |
| 2713163 | 9951636013593 | https://world.openfoodfacts.org/product/995163... | CARTE CADEAU XBOX 50 | NAN | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 21 |
| 2713230 | 99660000 | https://world.openfoodfacts.org/product/996600... | 100 PUR JUS DE POMME | NAN | NaN | NaN | NAN | 10.5 | 0.1 | NaN | 0.00 | NaN | 47.0 | 11.00 | 0.10 | NaN | NaN | 21 |
| 2713346 | 9990000000074 | https://world.openfoodfacts.org/product/999000... | GRAINES DE COURGE CHOCOLAT NOIR 57 | NAN | NaN | NaN | NAN | 33.0 | 20.0 | NaN | 0.15 | NaN | 555.0 | 36.00 | 12.50 | NaN | NaN | 35 |
| 2718729 | 99999995 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ 5 | NAN | Viandes | NaN | VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE DE... | 0.0 | 2.3 | NaN | 0.18 | 0.0 | 131.0 | 0.00 | 21.50 | NaN | a | 14 |
59368 rows × 18 columns
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str.replace(pattern[0], '', regex=True)
ind = df[df['PRODUCT_NAME'].str.contains(pattern[1], regex=True)].index
df = df.drop(ind)
pattern = r'\w'
df[df['PRODUCT_NAME'].str.contains(pattern, regex=True)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_PRODUCT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 0000000000017 | https://world.openfoodfacts.org/product/000000... | VITÓRIA CRACKERS | NAN | NaN | NaN | NAN | 15.0 | 3.08 | NaN | 1.40 | NaN | 375.0 | 70.1 | 7.8 | NaN | NaN | 16 |
| 5 | 0000000000031 | https://world.openfoodfacts.org/product/000000... | CACAO | NAN | NaN | 130.0 | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5 |
| 11 | 0000000000100 | https://world.openfoodfacts.org/product/000000... | MOUTARDE AU MOÛT DE RAISIN | COURTE PAILLE | Matières grasses et sauces | 100.0 | EAU GRAINES DE TÉGUMENTS DE MOUTARDE VINAIGRE ... | 22.0 | 2.20 | 0.0 | 4.60 | 0.0 | NaN | 29.0 | 5.1 | NaN | d | 26 |
| 12 | 0000000000123 | https://world.openfoodfacts.org/product/000000... | SAUCE SWEETY CHILI | NAN | NaN | NaN | NAN | 0.4 | 0.00 | NaN | 2.04 | NaN | 21.0 | 4.8 | 0.2 | NaN | NaN | 21 |
| 13 | 0000000000178 | https://world.openfoodfacts.org/product/000000... | MINI COCO | NAN | NaN | NaN | NAN | 3.0 | 1.00 | NaN | 1.15 | NaN | 60.0 | 10.0 | 2.0 | NaN | NaN | 9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2718722 | 99999988 | https://world.openfoodfacts.org/product/999999... | BOULES | NAN | NaN | NaN | NAN | 54.0 | 1.30 | NaN | 0.02 | NaN | 372.0 | 58.0 | 6.3 | NaN | NaN | 6 |
| 2718725 | 9999999004360 | https://world.openfoodfacts.org/product/999999... | MINIS BEIGNETS | NAN | NaN | NaN | NAN | 10.4 | 6.00 | NaN | 1.10 | NaN | 333.0 | 30.9 | 7.4 | NaN | d | 14 |
| 2718729 | 99999995 | https://world.openfoodfacts.org/product/999999... | STEAK HACHÉ | NAN | Viandes | NaN | VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE DE... | 0.0 | 2.30 | NaN | 0.18 | 0.0 | 131.0 | 0.0 | 21.5 | NaN | a | 14 |
| 2718733 | 999999999 | https://world.openfoodfacts.org/product/999999... | THÉ NOIR BIO DARJEELING | PAGÈS | NaN | NaN | NAN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 23 |
| 2718736 | 9999999999994 | https://world.openfoodfacts.org/product/999999... | LIGHT FREE SKYR A BOIRE | NAN | Produits Laitiers | NaN | NAN | 49.0 | 13.00 | 6.0 | 3.00 | NaN | 24.0 | 70.0 | 2.0 | NaN | e | 25 |
917701 rows × 18 columns
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str.split(n=3)
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str[:3]
df['LEN_PRODUCT']= df['PRODUCT_NAME'].str.len()
df['LEN_PRODUCT'].describe()
count 917714.000000 mean 2.544809 std 0.696738 min 0.000000 25% 2.000000 50% 3.000000 75% 3.000000 max 3.000000 Name: LEN_PRODUCT, dtype: float64
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str.join(',')
df['PRODUCT_NAME'] = df['PRODUCT_NAME'].str.replace(',', ' ')
df = df.drop(columns = 'LEN_PRODUCT')
df['LEN_BRANDS']= df['BRANDS'].str.len()
df['LEN_BRANDS'].describe()
count 917714.000000 mean 7.351860 std 16.261082 min 0.000000 25% 3.000000 50% 4.000000 75% 9.000000 max 13566.000000 Name: LEN_BRANDS, dtype: float64
sns.set()
sns.boxplot(df['LEN_BRANDS'])
<AxesSubplot:xlabel='LEN_BRANDS'>
df[df['LEN_BRANDS']== df['LEN_BRANDS'].min()] #0 étant le minimum
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_BRANDS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1051497 | 3162780983052 | https://world.openfoodfacts.org/product/316278... | BAGUETTE DE PAIN | NaN | NaN | NAN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
#Les valeurs abberantes
ls_ab = df[df['LEN_BRANDS']< 2].BRANDS.unique().tolist()
ls_ab
['U', 'M', 'T', '?', 'X', 'C', 'D', 'R', '-', '1', '7', '/', 'G', 'S', '', 'J', 'I', 'A', '6', '—', 'Y', 'Z', 'E', '0', 'Q', 'Е', 'K', 'P', '.', 'V']
#Je retire la marque U de la listes des marques abbérantes
ls_ab.remove('U')
#ls_ab
for i in ls_ab:
df.loc[df['BRANDS']==i,'BRANDS'] = np.nan
df[df['LEN_BRANDS']== df['LEN_BRANDS'].max()]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_BRANDS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1598725 | 4061458040846 | https://world.openfoodfacts.org/product/406145... | MEHL WEIZEN TYP | MÜHLEN GOLD, MÜHLENGOLD, GOLDÄHREN, MÜHLEN GOL... | Céréales et pomme de terre | 1000.0 | WEIZENMEHL. | 0.5 | 0.2 | NaN | 0.01 | 0.0 | 340.0 | 72.0 | 10.0 | 1 | a | 13566 |
pattern = [r'\d+\s',r'^\d\d\d+']
df[df['PRODUCT_NAME'].str.contains(pattern[1], regex=True)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_BRANDS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1726082 | 4901630002104 | https://world.openfoodfacts.org/product/490163... | 115G | NAN | NaN | NaN | NAN | 24.0 | 0.0 | NaN | 0.0 | NaN | 120.0 | 24.0 | 0.0 | NaN | NaN | 3 |
ind = df[df['PRODUCT_NAME'].str.contains(pattern[1], regex=True)].index
df = df.drop(ind)
df['BRANDS'] = df['BRANDS'].str.split(n=2)
df['BRANDS'] = df['BRANDS'].str[:2]
df['BRANDS'] = df['BRANDS'].str.join(',')
df['BRANDS'] = df['BRANDS'].str.replace(',', ' ')
df[df['LEN_BRANDS']== df['LEN_BRANDS'].max()].URL.values
array(['https://world.openfoodfacts.org/product/4061458040846/MEHL-WEIZEN-TYP-405-MUHLEN-GOLD'],
dtype=object)
Pour la variable BRAND je choisi de ne prendre que la première partie de la valeur. Je divise chaque valeur en utilisant ',' comme séparateur.
Exemple :
#Test
#df.loc[1598725,'BRANDS'].split(',')[0]
La médiane du nombre de caractère pour la variable BRAND est 4 et le 3ème quartile est 9. J'implémente l'algorithme du split pour tous les individus tels que LEN_BRANDS > 9
#df.loc[1598725,'BRANDS']='MÜHLEN GOLD'
#df.loc[2045031,'BRANDS']='COOP'
#df.loc[1680917,'BRANDS']='GUT & GÜNSTIG'
#Vérification
df.loc[1598725,'BRANDS']
'MÜHLEN GOLD '
df['LEN_BRANDS']= df['BRANDS'].str.len()
sns.set()
sns.boxplot(df['LEN_BRANDS'])
<AxesSubplot:xlabel='LEN_BRANDS'>
df = df.drop(columns = 'LEN_BRANDS')
df['LEN_INGREDIENTS'] = df['INGREDIENTS_TEXT'].str.len()
df['LEN_INGREDIENTS'].describe()
count 917713.000000 mean 75.370953 std 175.630549 min 0.000000 25% 3.000000 50% 3.000000 75% 49.000000 max 7731.000000 Name: LEN_INGREDIENTS, dtype: float64
sns.set()
sns.boxplot(df['LEN_INGREDIENTS'])
<AxesSubplot:xlabel='LEN_INGREDIENTS'>
df[df['LEN_INGREDIENTS']==df['LEN_INGREDIENTS'].min()].INGREDIENTS_TEXT.unique()
array([''], dtype=object)
pattern.append('^\w\s')
pattern
['\\d+\\s', '^\\d\\d\\d+', '^\\w\\s']
df.loc[df['LEN_INGREDIENTS'] <= 2,'INGREDIENTS_TEXT']=np.nan
df.loc[df['INGREDIENTS_TEXT']=='RIEN','INGREDIENTS_TEXT']=np.nan
#Recherche de l'individu dont le produit est décrit en 7731 caractères
#Traduction dans toutes les langues !
#df[df['LEN_INGREDIENTS']==df['LEN_INGREDIENTS'].max()].INGREDIENTS_TEXT.values
df['INGREDIENTS_TEXT'].str.split('.')
#df['INGREDIENTS_TEXT'].str[0]
4 [NAN]
5 [NAN]
11 [EAU GRAINES DE TÉGUMENTS DE MOUTARDE VINAIGRE...
12 [NAN]
13 [NAN]
...
2718722 [NAN]
2718725 [NAN]
2718729 [VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE D...
2718733 [NAN]
2718736 [NAN]
Name: INGREDIENTS_TEXT, Length: 917713, dtype: object
df['INGREDIENTS_TEXT'] = df['INGREDIENTS_TEXT'].str.split('.')
df['INGREDIENTS_TEXT'] = df['INGREDIENTS_TEXT'].str[0]
df['INGREDIENTS_TEXT']
4 NAN
5 NAN
11 EAU GRAINES DE TÉGUMENTS DE MOUTARDE VINAIGRE ...
12 NAN
13 NAN
...
2718722 NAN
2718725 NAN
2718729 VIANDE HACHEE PUR BOEUF 5% M,G, POURCENTAGE DE...
2718733 NAN
2718736 NAN
Name: INGREDIENTS_TEXT, Length: 917713, dtype: object
df['LEN_INGREDIENTS'] = df['INGREDIENTS_TEXT'].str.len()
df['LEN_INGREDIENTS'].describe()
count 917645.000000 mean 56.000264 std 129.608578 min 0.000000 25% 3.000000 50% 3.000000 75% 35.000000 max 5561.000000 Name: LEN_INGREDIENTS, dtype: float64
df.loc[df['LEN_INGREDIENTS'] <= 2,'INGREDIENTS_TEXT']=np.nan
Les valeurs des variables listées ci-dessous devrait être comprise entre 0 et 100.
La valeur max de energy-kcal_100g et de 900kcal d'après la littérature. Ce qui correspond au maximum de calorie générée par 100g de graisse.
J'utilise ma fonction val_abberante pour affecter aux valeurs abberantes la valeur np.nan
var_quant = df.dtypes[df.dtypes == float].index
var_quant
var_quant_100 = var_quant.drop(['PRODUCT_QUANTITY',
'ADDITIVES_N',
'ENERGY-KCAL_100G',
'LEN_INGREDIENTS'])
var_quant_100
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'CARBOHYDRATES_100G', 'PROTEINS_100G'],
dtype='object')
fig, ax = plt.subplots(2,3,figsize=(12,15),dpi= 150)
#,title = 'Valeurs abbérantes quantitatives' )
sns.set()
for i,var in enumerate(var_quant_100):
if i < 3:
sns.boxplot(df[var], ax = ax[0,i])
else :
#print(i)
sns.boxplot(df[var], ax = ax[1,i-3])
plt.savefig('P3_32a.png',dpi=150)
for var in var_quant_100:
print(var)
df = clean.val_aberrante(df, var,0.,100.)
SUGARS_100G SATURATED-FAT_100G SATURATED-FAT_100G Pas de valeurs < 0.0 FIBER_100G FIBER_100G Pas de valeurs < 0.0 SALT_100G SALT_100G Pas de valeurs < 0.0 CARBOHYDRATES_100G CARBOHYDRATES_100G Pas de valeurs < 0.0 PROTEINS_100G
fig, ax = plt.subplots(1, 2,figsize=(15,7),dpi =150)
sns.set()
sns.boxplot(data = df[var_quant_100], orient ='v', ax = ax[0])
ax[0].tick_params(labelrotation = 90)
ax[0].set_title('Visualisation de la dispersion des variables comprises entre 0 et 100 \n (avec valeurs extrêmes)')
sns.boxplot(data = df[var_quant_100], orient ='v',ax = ax[1],showfliers=False)
ax[1].tick_params(labelrotation = 90)
ax[1].set_title('Visualisation de la dispersion des variables comprises entre 0 et 100 \n (sans valeurs extrêmes)')
plt.savefig('P3_32b.png',dpi = 150,bbox_inches = 'tight')
df['PRODUCT_QUANTITY'].describe()
count 2.938090e+05 mean 3.063225e+13 std 1.660389e+16 min 0.000000e+00 25% 1.600000e+02 50% 3.000000e+02 75% 5.000000e+02 max 9.000000e+18 Name: PRODUCT_QUANTITY, dtype: float64
sns.set()
sns.boxplot(df['PRODUCT_QUANTITY'])
<AxesSubplot:xlabel='PRODUCT_QUANTITY'>
Je ne conserve que les quantités inférieures ou égale à 5kg et supérieur à 10g. J'estime que les autres valeurs sont abbérantes et les remplacent par des valeurs nulles.
df.loc[df['PRODUCT_QUANTITY']> 5e3,'PRODUCT_QUANTITY']= np.nan
df.loc[df['PRODUCT_QUANTITY'] < 10,'PRODUCT_QUANTITY']= np.nan
sns.set()
sns.displot(df['PRODUCT_QUANTITY'], kind = 'kde')
<seaborn.axisgrid.FacetGrid at 0x7f918ab86400>
Je borne l'énergie entre 0 kcal pour l'eau et 900 kcal pour les huiles.
df['ENERGY-KCAL_100G'].describe()
count 703003.000000 mean 276.250341 std 204.341596 min 0.000000 25% 112.000000 50% 263.000000 75% 400.000000 max 31000.000000 Name: ENERGY-KCAL_100G, dtype: float64
col = 'ENERGY-KCAL_100G'
df = clean.val_aberrante(df,col ,0.,900.)
fig, ax = plt.subplots(figsize=(4,5))
sns.set()
sns.boxplot(df[col], orient ='v',showfliers=False)
ax.set_xlabel(col)
ENERGY-KCAL_100G Pas de valeurs < 0.0
Text(0.5, 0, 'ENERGY-KCAL_100G')
df['ADDITIVES_N'].describe()
count 273189.000000 mean 1.572263 std 2.331602 min 0.000000 25% 0.000000 50% 1.000000 75% 2.000000 max 32.000000 Name: ADDITIVES_N, dtype: float64
df.to_csv('/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_additives.csv',index=False)
df = pd.read_csv('df_additives.csv',
dtype={'NOVA_GROUP': 'category',
'NUTRISCORE_GRADE': 'category',
'CODE':'object',
'PNNS_GROUPS_1':'category'
})
Le valeurs manquantes prennent plusieurs valeurs, 'SANS MARQUE', '', etc... Je cherche à leur affecter la valeur np.nan pour d'avantage de clarté.
#Remplacement des NAN et UNKNOWN par des np.nan
df.loc[df['BRANDS']=='NAN','BRANDS'] = np.nan
df.loc[df['BRANDS']== 'SANS MARQUE', 'BRANDS'] = np.nan
df.loc[df['BRANDS'].str.contains('SANS MARQUE').fillna(False), 'BRANDS'] = np.nan
df.loc[df['PRODUCT_NAME']=='NAN','PRODUCT_NAME'] = np.nan
df.loc[df['PRODUCT_NAME']=='LOADING...','PRODUCT_NAME'] = np.nan
df.loc[df['INGREDIENTS_TEXT']=='NAN','INGREDIENTS_TEXT'] = np.nan
#df
Utilisation de la fonction vide de mon module clean. La fonction remplace les cases vide type '', par np.nan
var_qual = df.dtypes.loc[df.dtypes == object].index
df = clean.vide(df,var_qual)
Aucune case vide, variable CODE traitée Aucune case vide, variable URL traitée Aucune case vide, variable PRODUCT_NAME traitée Aucune case vide, variable BRANDS traitée Aucune case vide, variable INGREDIENTS_TEXT traitée
#Suppression des produits sans dénomination
print('Nombre de produits sans dénomination',df[(df['PRODUCT_NAME'].isna())].shape[0])
df = df[~(df['PRODUCT_NAME'].isna())]
df.shape
Nombre de produits sans dénomination 7595
(910118, 18)
#Découverte de nouvelles abbérations, les produits non comestibles
ind = df[df['BRANDS']=='DOVE'].index
df = df.drop(ind)
ind = df[df['BRANDS']=='MONSAVON AU LAIT'].index
df = df.drop(ind)
#Supression des réferences qui n'ont pas de code barres
print('Nombre de produits sans code barre',df[~df['CODE'].str.startswith('200')].shape[0])
df = df[~df['CODE'].str.startswith('200')]
df.shape
Nombre de produits sans code barre 896196
(896196, 18)
#Les produits ayant le même code bar
df[df.duplicated('CODE',keep = False)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_INGREDIENTS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2732 | 14100097006 | https://world.openfoodfacts.org/product/001410... | CHOCOLATE CHUNK MILK | CHOCOLATE CHUNK PEPPERIDGE | Sucreries | 204.0 | FARINE DE _BLÉ_ ENRICHIE (FARINE, NIACINE, FER... | 33.0 | 15.0 | 4.0 | 0.77 | 4.0 | 525.0 | 58.0 | 5.6 | 4 | e | 616.0 |
| 2766 | 14113230254 | https://world.openfoodfacts.org/product/001411... | ALMONDS | WONDERFUL | Collation salée | 100.0 | NaN | 4.4 | 3.8 | NaN | 0.00 | NaN | 596.0 | 9.1 | 21.0 | NaN | b | 3.0 |
| 2772 | 14113910569 | https://world.openfoodfacts.org/product/001411... | PISTACHIOS | WONDERFUL | Collation salée | 250.0 | PISTACHES (99 | 7.8 | 5.6 | NaN | 0.65 | 0.0 | 590.0 | 17.4 | 21.4 | 3 | c | 13.0 |
| 2775 | 14113910811 | https://world.openfoodfacts.org/product/001411... | PISTACHES | WONDERFUL | Collation salée | 450.0 | PISTACHES | 7.7 | 5.6 | 10.1 | 0.00 | 0.0 | 590.0 | 18.0 | 21.0 | 1 | a | 9.0 |
| 2916 | 15869005134 | https://world.openfoodfacts.org/product/001586... | PAVE DE SAUMON | BOFROST | Viandes | NaN | NaN | 0.1 | 3.1 | 0.1 | 0.15 | NaN | 202.0 | 0.1 | 20.4 | NaN | b | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 916309 | 96619474134 | https://world.openfoodfacts.org/product/966194... | STEAK STRIPS | NaN | NaN | NaN | NaN | 15.9 | 1.3 | NaN | 3.80 | NaN | 234.0 | 15.9 | 34.4 | NaN | NaN | 3.0 |
| 916310 | 96619512614 | https://world.openfoodfacts.org/product/966195... | EAU DE COCO | NaN | NaN | NaN | NaN | 3.8 | 0.0 | NaN | 0.04 | NaN | 18.0 | 4.4 | 0.0 | NaN | d | 3.0 |
| 916383 | 97069001550 | https://world.openfoodfacts.org/product/970690... | KUNTERBUNT | NaN | NaN | NaN | NaN | 16.2 | 0.1 | NaN | 0.01 | NaN | 77.0 | 17.6 | 0.2 | NaN | NaN | 3.0 |
| 916384 | 97069051166 | https://world.openfoodfacts.org/product/970690... | COUPE FRAISE VANILLE | NaN | NaN | NaN | NaN | 25.0 | 6.5 | NaN | 0.05 | NaN | 192.0 | 30.0 | 1.6 | NaN | NaN | 3.0 |
| 916464 | 97643090345 | https://world.openfoodfacts.org/product/976430... | ROSEMARIE MILK | NaN | NaN | NaN | NaN | 6.0 | 16.0 | NaN | 0.13 | NaN | 505.0 | 49.0 | 7.0 | NaN | NaN | 3.0 |
212 rows × 18 columns
df = clean.doublon(df,'CODE')
Recherche de doublon : il y a 212 doublons qui ont la même clé: CODE
df[df.duplicated(['PRODUCT_NAME','BRANDS','PRODUCT_QUANTITY'],keep = False)]
| CODE | URL | PRODUCT_NAME | BRANDS | PNNS_GROUPS_1 | PRODUCT_QUANTITY | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | LEN_INGREDIENTS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 252246 | 3222472334682 | https://world.openfoodfacts.org/product/322247... | COOKIES PÉPITES DE | CASINO | Sucreries | 200.0 | FARINE DE BLÉ - SUCRE - PÉPITES DE CHOCOLAT 15 | 30.0 | 5.0 | 4.1 | 0.50 | 4.0 | 517.0 | 62.0 | 6.8 | 4 | d | 46.0 |
| 252253 | 3222472344193 | https://world.openfoodfacts.org/product/322247... | POÊLÉE PAYSANNE POMMES | CASINO | Plats cuisinés | 900.0 | POMMES DE TERRE PRÉFRITES EN QUARTIERS 62,8% (... | 1.2 | 1.4 | 2.0 | 0.62 | 8.0 | 137.0 | 14.0 | 4.1 | 4 | b | 771.0 |
| 252278 | 3222472384090 | https://world.openfoodfacts.org/product/322247... | PETITS POTS DE | CASINO | Produits Laitiers | 400.0 | LAIT ENTIER - CRÈME 14 | 16.0 | 5.0 | 0.6 | 0.12 | 0.0 | 160.0 | 18.0 | 3.8 | 3 | c | 22.0 |
| 252279 | 3222472384120 | https://world.openfoodfacts.org/product/322247... | PETITS POTS DE | CASINO | Produits Laitiers | 400.0 | LAIT ENTIER - CRÈME 14,6% - SUCRE - CHOCOLAT 5... | 18.0 | 5.4 | 1.8 | 0.13 | 5.0 | 167.0 | 19.0 | 3.6 | 4 | c | 353.0 |
| 252276 | 3222472376422 | https://world.openfoodfacts.org/product/322247... | YAOURT À LA | CASINO | Produits Laitiers | 600.0 | _LAIT_ ENTIER 78 | 2.5 | 6.5 | 0.0 | 0.10 | 0.0 | 107.0 | 3.8 | 2.9 | 4 | c | 16.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 171454 | 2604276064810 | https://world.openfoodfacts.org/product/260427... | COTIS SALÉ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
| 171433 | 26042176 | https://world.openfoodfacts.org/product/260421... | ROMARIN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
| 127912 | 2210178034112 | https://world.openfoodfacts.org/product/221017... | COMTE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
| 45655 | 213402017457 | https://world.openfoodfacts.org/product/021340... | CUISSE DE POULET | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
| 772500 | 7029429988884 | https://world.openfoodfacts.org/product/702942... | FAUX MAGE CLASSIC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
312766 rows × 18 columns
df = clean.doublon(df,['PRODUCT_NAME','BRANDS','PRODUCT_QUANTITY'])
df.shape
Recherche de doublon : il y a 312766 doublons qui ont la même clé: ['PRODUCT_NAME', 'BRANDS', 'PRODUCT_QUANTITY']
(650564, 18)
df = df.drop(columns = ['BRANDS','PRODUCT_QUANTITY'])
df.to_csv('/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_export1.csv',index=False)
var_quant = df.dtypes[df.dtypes == float].index
var_quant
var_quant_100 = var_quant.drop([
'ADDITIVES_N',
'ENERGY-KCAL_100G',
'LEN_INGREDIENTS'])
var_quant_bis = var_quant.drop(['LEN_INGREDIENTS'])
Je suppose que :
les produits dont la désignation commence par SUCRE ou SIROP, sont à ranger dans la catégorie des sucreries et leur taux de surce est respectivement 100 et 70.
les produits dont la désignation commence par EAU ou Huile sont à ranger dans les catégories 'Boissons' et 'Matières grasses' leur calories sont respectivement 0kcal et 900kcal.
J'applique le même raisonnement pour le sel
J'ajoute une nouvelle catégorie aux PNNS_GROUPS_1 : les assaisonnements.
def imput_alim(df, var1,motcle,var2,valimput):
# 1. La fonction imput_alim recherche dans la var1 (str), les individus commençant par motcle (str)
# 2. Pour la sélection, la fonction impute valimput à la var2
print(df.loc[(df[var1].str.contains('^'+motcle) == True)].isna().mean())
df.loc[(df[var1].str.contains('^'+motcle) == True) & (df[var2].isna()),var2] = valimput
return df
## Pour le Sucre
pattern = ['SUCRE\s', 'SUGAR\s']
for i in pattern:
df = imput_alim(df,'PRODUCT_NAME',i,'SUGARS_100G',100)
df = imput_alim(df,'PRODUCT_NAME',i,'PNNS_GROUPS_1','Sucreries')
df = imput_alim(df,'INGREDIENTS_TEXT',i,'PNNS_GROUPS_1','Sucreries')
## Les sirops
pat = 'SIROP\s'
df = imput_alim(df,'PRODUCT_NAME',pat,'SUGARS_100G',70)
df = imput_alim(df,'PRODUCT_NAME',pat,'PNNS_GROUPS_1','Sucreries')
df = imput_alim(df,'INGREDIENTS_TEXT',pat,'PNNS_GROUPS_1','Sucreries')
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.449318 INGREDIENTS_TEXT 0.554581 SUGARS_100G 0.159844 SATURATED-FAT_100G 0.165692 FIBER_100G 0.716374 SALT_100G 0.293372 ADDITIVES_N 0.552632 ENERGY-KCAL_100G 0.231969 CARBOHYDRATES_100G 0.213450 PROTEINS_100G 0.216374 NOVA_GROUP 0.490253 NUTRISCORE_GRADE 0.552632 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.449318 INGREDIENTS_TEXT 0.554581 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.165692 FIBER_100G 0.716374 SALT_100G 0.293372 ADDITIVES_N 0.552632 ENERGY-KCAL_100G 0.231969 CARBOHYDRATES_100G 0.213450 PROTEINS_100G 0.216374 NOVA_GROUP 0.490253 NUTRISCORE_GRADE 0.552632 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.198402 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.035619 SATURATED-FAT_100G 0.048935 FIBER_100G 0.455393 SALT_100G 0.051598 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.108189 CARBOHYDRATES_100G 0.038615 PROTEINS_100G 0.042943 NOVA_GROUP 0.079561 NUTRISCORE_GRADE 0.294274 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.669565 INGREDIENTS_TEXT 0.634783 SUGARS_100G 0.086957 SATURATED-FAT_100G 0.078261 FIBER_100G 0.686957 SALT_100G 0.173913 ADDITIVES_N 0.634783 ENERGY-KCAL_100G 0.147826 CARBOHYDRATES_100G 0.121739 PROTEINS_100G 0.121739 NOVA_GROUP 0.669565 NUTRISCORE_GRADE 0.739130 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.669565 INGREDIENTS_TEXT 0.634783 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.078261 FIBER_100G 0.686957 SALT_100G 0.173913 ADDITIVES_N 0.634783 ENERGY-KCAL_100G 0.147826 CARBOHYDRATES_100G 0.121739 PROTEINS_100G 0.121739 NOVA_GROUP 0.669565 NUTRISCORE_GRADE 0.739130 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.416667 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.027778 SATURATED-FAT_100G 0.027778 FIBER_100G 0.500000 SALT_100G 0.027778 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.027778 CARBOHYDRATES_100G 0.027778 PROTEINS_100G 0.027778 NOVA_GROUP 0.333333 NUTRISCORE_GRADE 0.500000 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.380237 INGREDIENTS_TEXT 0.533520 SUGARS_100G 0.120112 SATURATED-FAT_100G 0.135126 FIBER_100G 0.718575 SALT_100G 0.208450 ADDITIVES_N 0.533170 ENERGY-KCAL_100G 0.166899 CARBOHYDRATES_100G 0.135126 PROTEINS_100G 0.146299 NOVA_GROUP 0.566341 NUTRISCORE_GRADE 0.774092 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.380237 INGREDIENTS_TEXT 0.533520 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.135126 FIBER_100G 0.718575 SALT_100G 0.208450 ADDITIVES_N 0.533170 ENERGY-KCAL_100G 0.166899 CARBOHYDRATES_100G 0.135126 PROTEINS_100G 0.146299 NOVA_GROUP 0.566341 NUTRISCORE_GRADE 0.774092 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.258900 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.036812 SATURATED-FAT_100G 0.048139 FIBER_100G 0.565939 SALT_100G 0.059871 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.123786 CARBOHYDRATES_100G 0.044498 PROTEINS_100G 0.046521 NOVA_GROUP 0.033576 NUTRISCORE_GRADE 0.314725 LEN_INGREDIENTS 0.000000 dtype: float64
df.loc[(df['SUGARS_100G']==100), 'ENERGY-KCAL_100G'] = 400
df.loc[(df['SUGARS_100G']==100), 'CARBOHYDRATES_100G'] = 100
df.loc[(df['SUGARS_100G']==100), ['SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G', 'PROTEINS_100G']] = 0
## Pour les huiles et beurres
pattern = ['HUILE\s','OIL\s','BEURRE\s']
for i in pattern:
df = imput_alim(df,'PRODUCT_NAME',i,'ENERGY-KCAL_100G',900)
df = imput_alim(df,'PRODUCT_NAME',i,'PNNS_GROUPS_1','Matières grasses et sauces')
df = imput_alim(df,'INGREDIENTS_TEXT',i,'PNNS_GROUPS_1','Matières grasses et sauces')
## Pour l'eau
pattern= ['EAU\s','WATER\s']
for i in pattern:
df = imput_alim(df,'PRODUCT_NAME',i,'ENERGY-KCAL_100G',0)
df = imput_alim(df,'PRODUCT_NAME',i,'PNNS_GROUPS_1','Boissons')
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.187604 INGREDIENTS_TEXT 0.524030 SUGARS_100G 0.185946 SATURATED-FAT_100G 0.174677 FIBER_100G 0.677163 SALT_100G 0.232019 ADDITIVES_N 0.522705 ENERGY-KCAL_100G 0.203513 CARBOHYDRATES_100G 0.180974 PROTEINS_100G 0.180312 NOVA_GROUP 0.343719 NUTRISCORE_GRADE 0.320517 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.187604 INGREDIENTS_TEXT 0.524030 SUGARS_100G 0.185946 SATURATED-FAT_100G 0.174677 FIBER_100G 0.677163 SALT_100G 0.232019 ADDITIVES_N 0.522705 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.180974 PROTEINS_100G 0.180312 NOVA_GROUP 0.343719 NUTRISCORE_GRADE 0.320517 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.093770 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.052023 SATURATED-FAT_100G 0.047848 FIBER_100G 0.549133 SALT_100G 0.057482 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.093770 CARBOHYDRATES_100G 0.047206 PROTEINS_100G 0.046885 NOVA_GROUP 0.135517 NUTRISCORE_GRADE 0.131021 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.571429 INGREDIENTS_TEXT 1.000000 SUGARS_100G 0.285714 SATURATED-FAT_100G 0.285714 FIBER_100G 0.857143 SALT_100G 0.571429 ADDITIVES_N 1.000000 ENERGY-KCAL_100G 0.285714 CARBOHYDRATES_100G 0.285714 PROTEINS_100G 0.285714 NOVA_GROUP 0.571429 NUTRISCORE_GRADE 0.857143 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.571429 INGREDIENTS_TEXT 1.000000 SUGARS_100G 0.285714 SATURATED-FAT_100G 0.285714 FIBER_100G 0.857143 SALT_100G 0.571429 ADDITIVES_N 1.000000 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.285714 PROTEINS_100G 0.285714 NOVA_GROUP 0.571429 NUTRISCORE_GRADE 0.857143 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.000000 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.000000 FIBER_100G 0.333333 SALT_100G 0.000000 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.000000 PROTEINS_100G 0.000000 NOVA_GROUP 0.666667 NUTRISCORE_GRADE 0.000000 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.231301 INGREDIENTS_TEXT 0.448146 SUGARS_100G 0.191075 SATURATED-FAT_100G 0.191703 FIBER_100G 0.695160 SALT_100G 0.230044 ADDITIVES_N 0.446260 ENERGY-KCAL_100G 0.223759 CARBOHYDRATES_100G 0.191703 PROTEINS_100G 0.187304 NOVA_GROUP 0.329353 NUTRISCORE_GRADE 0.336266 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.231301 INGREDIENTS_TEXT 0.448146 SUGARS_100G 0.191075 SATURATED-FAT_100G 0.191703 FIBER_100G 0.695160 SALT_100G 0.230044 ADDITIVES_N 0.446260 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.191703 PROTEINS_100G 0.187304 NOVA_GROUP 0.329353 NUTRISCORE_GRADE 0.336266 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.090090 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.051802 SATURATED-FAT_100G 0.054054 FIBER_100G 0.635135 SALT_100G 0.056306 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.049550 CARBOHYDRATES_100G 0.056306 PROTEINS_100G 0.049550 NOVA_GROUP 0.123874 NUTRISCORE_GRADE 0.137387 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.445912 INGREDIENTS_TEXT 0.586292 SUGARS_100G 0.338563 SATURATED-FAT_100G 0.337737 FIBER_100G 0.731627 SALT_100G 0.348472 ADDITIVES_N 0.585467 ENERGY-KCAL_100G 0.355904 CARBOHYDRATES_100G 0.340215 PROTEINS_100G 0.342692 NOVA_GROUP 0.547481 NUTRISCORE_GRADE 0.430223 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.445912 INGREDIENTS_TEXT 0.586292 SUGARS_100G 0.338563 SATURATED-FAT_100G 0.337737 FIBER_100G 0.731627 SALT_100G 0.348472 ADDITIVES_N 0.585467 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.340215 PROTEINS_100G 0.342692 NOVA_GROUP 0.547481 NUTRISCORE_GRADE 0.430223 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.736111 INGREDIENTS_TEXT 0.763889 SUGARS_100G 0.125000 SATURATED-FAT_100G 0.138889 FIBER_100G 0.708333 SALT_100G 0.222222 ADDITIVES_N 0.763889 ENERGY-KCAL_100G 0.152778 CARBOHYDRATES_100G 0.138889 PROTEINS_100G 0.138889 NOVA_GROUP 0.833333 NUTRISCORE_GRADE 0.694444 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.736111 INGREDIENTS_TEXT 0.763889 SUGARS_100G 0.125000 SATURATED-FAT_100G 0.138889 FIBER_100G 0.708333 SALT_100G 0.222222 ADDITIVES_N 0.763889 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.138889 PROTEINS_100G 0.138889 NOVA_GROUP 0.833333 NUTRISCORE_GRADE 0.694444 LEN_INGREDIENTS 0.000000 dtype: float64
df.loc[(df['PRODUCT_NAME'].str.contains('^EAU\s') == True)][var_quant_bis].fillna(0)
df.loc[(df['PRODUCT_NAME'].str.contains('^EAU\s') == True)]['PNNS_GROUPS_1'].fillna('Boissons')
294154 Boissons
297050 Boissons
297476 Boissons
294583 Boissons
296762 Boissons
...
726738 Boissons
713167 Boissons
415629 Boissons
910166 Boissons
243112 Boissons
Name: PNNS_GROUPS_1, Length: 1211, dtype: category
Categories (9, object): ['Boissons', 'Collation salée', 'Céréales et pomme de terre', 'Fruits et légumes', ..., 'Plats cuisinés', 'Produits Laitiers', 'Sucreries', 'Viandes']
#Ajout d'une nouvelle catégorie
df['PNNS_GROUPS_1'] = df['PNNS_GROUPS_1'].cat.add_categories(['Assaisonnements'])
df['PNNS_GROUPS_1'].unique()
['Céréales et pomme de terre', 'Viandes', 'Produits Laitiers', 'Sucreries', 'Plats cuisinés', 'Collation salée', 'Fruits et légumes', 'Matières grasses et sauces', 'Boissons', NaN] Categories (10, object): ['Boissons', 'Collation salée', 'Céréales et pomme de terre', 'Fruits et légumes', ..., 'Produits Laitiers', 'Sucreries', 'Viandes', 'Assaisonnements']
## Les sels
pattern = ['SEL\s','SALT\s']
for i in pattern:
df = imput_alim(df,'PRODUCT_NAME',i,'SALT_100G',100)
df = imput_alim(df,'PRODUCT_NAME',i,'PNNS_GROUPS_1','Assaisonnements')
df = imput_alim(df,'INGREDIENTS_TEXT',i,'PNNS_GROUPS_1','Assaisonnements')
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.991706 INGREDIENTS_TEXT 0.706161 SUGARS_100G 0.612559 SATURATED-FAT_100G 0.612559 FIBER_100G 0.870853 SALT_100G 0.598341 ADDITIVES_N 0.704976 ENERGY-KCAL_100G 0.574645 CARBOHYDRATES_100G 0.616114 PROTEINS_100G 0.614929 NOVA_GROUP 0.612559 NUTRISCORE_GRADE 0.978673 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.991706 INGREDIENTS_TEXT 0.706161 SUGARS_100G 0.612559 SATURATED-FAT_100G 0.612559 FIBER_100G 0.870853 SALT_100G 0.000000 ADDITIVES_N 0.704976 ENERGY-KCAL_100G 0.574645 CARBOHYDRATES_100G 0.616114 PROTEINS_100G 0.614929 NOVA_GROUP 0.612559 NUTRISCORE_GRADE 0.978673 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.618304 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.229911 SATURATED-FAT_100G 0.229911 FIBER_100G 0.535714 SALT_100G 0.113839 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.245536 CARBOHYDRATES_100G 0.223214 PROTEINS_100G 0.223214 NOVA_GROUP 0.178571 NUTRISCORE_GRADE 0.834821 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.789474 INGREDIENTS_TEXT 0.771930 SUGARS_100G 0.210526 SATURATED-FAT_100G 0.210526 FIBER_100G 0.789474 SALT_100G 0.228070 ADDITIVES_N 0.771930 ENERGY-KCAL_100G 0.228070 CARBOHYDRATES_100G 0.228070 PROTEINS_100G 0.228070 NOVA_GROUP 0.789474 NUTRISCORE_GRADE 0.736842 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.789474 INGREDIENTS_TEXT 0.771930 SUGARS_100G 0.210526 SATURATED-FAT_100G 0.210526 FIBER_100G 0.789474 SALT_100G 0.000000 ADDITIVES_N 0.771930 ENERGY-KCAL_100G 0.228070 CARBOHYDRATES_100G 0.228070 PROTEINS_100G 0.228070 NOVA_GROUP 0.789474 NUTRISCORE_GRADE 0.736842 LEN_INGREDIENTS 0.000000 dtype: float64 CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.727273 INGREDIENTS_TEXT 0.000000 SUGARS_100G 0.090909 SATURATED-FAT_100G 0.090909 FIBER_100G 0.636364 SALT_100G 0.090909 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.090909 CARBOHYDRATES_100G 0.090909 PROTEINS_100G 0.090909 NOVA_GROUP 0.090909 NUTRISCORE_GRADE 0.727273 LEN_INGREDIENTS 0.000000 dtype: float64
df.loc[df['SALT_100G']==100, ['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G','ENERGY-KCAL_100G', 'CARBOHYDRATES_100G',
'PROTEINS_100G', 'LEN_INGREDIENTS']] = 0
pattern = 'POIVRE\s'
df = imput_alim(df,'PRODUCT_NAME',pattern,'PNNS_GROUPS_1','Assaisonnements')
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.993103 INGREDIENTS_TEXT 0.672414 SUGARS_100G 0.686207 SATURATED-FAT_100G 0.684483 FIBER_100G 0.922414 SALT_100G 0.732759 ADDITIVES_N 0.672414 ENERGY-KCAL_100G 0.686207 CARBOHYDRATES_100G 0.684483 PROTEINS_100G 0.684483 NOVA_GROUP 0.775862 NUTRISCORE_GRADE 0.991379 LEN_INGREDIENTS 0.000000 dtype: float64
var_quant
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'ADDITIVES_N', 'ENERGY-KCAL_100G', 'CARBOHYDRATES_100G',
'PROTEINS_100G', 'LEN_INGREDIENTS'],
dtype='object')
clean.vm(df,0.001,'P3_41a.png')
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.542803 INGREDIENTS_TEXT 0.621465 SUGARS_100G 0.176430 SATURATED-FAT_100G 0.177163 FIBER_100G 0.716349 SALT_100G 0.217716 ADDITIVES_N 0.620605 ENERGY-KCAL_100G 0.203998 CARBOHYDRATES_100G 0.181467 PROTEINS_100G 0.179097 NOVA_GROUP 0.665526 NUTRISCORE_GRADE 0.586254 LEN_INGREDIENTS 0.000078 dtype: float64
var_quant_100
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'CARBOHYDRATES_100G', 'PROTEINS_100G'],
dtype='object')
df['PNNS_GROUPS_1'] = df['PNNS_GROUPS_1'].cat.add_categories(['Cat. Inconnue'])
df.loc[df['PNNS_GROUPS_1'].isna(),'PNNS_GROUPS_1'] = 'Cat. Inconnue'
df['PNNS_GROUPS_1'].unique()
['Céréales et pomme de terre', 'Viandes', 'Produits Laitiers', 'Sucreries', 'Plats cuisinés', ..., 'Fruits et légumes', 'Matières grasses et sauces', 'Boissons', 'Cat. Inconnue', 'Assaisonnements'] Length: 11 Categories (11, object): ['Boissons', 'Collation salée', 'Céréales et pomme de terre', 'Fruits et légumes', ..., 'Sucreries', 'Viandes', 'Assaisonnements', 'Cat. Inconnue']
ind = pd.Index(df['PNNS_GROUPS_1'].unique())
df_VM = pd.DataFrame(columns = ind, index =var_quant_bis)
for i,values in enumerate(df['PNNS_GROUPS_1'].unique()):
df_VM[values] = df[df['PNNS_GROUPS_1']==values][var_quant_bis].isna().sum()
df_VM = df_VM.T
df_VM['TOTAL'] = df_VM.sum(axis=1)
df_VM
| SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | TOTAL | |
|---|---|---|---|---|---|---|---|---|---|
| Céréales et pomme de terre | 2114 | 2180 | 12253 | 2991 | 10509 | 3510 | 2053 | 1988 | 37598 |
| Viandes | 4913 | 4873 | 27492 | 5044 | 16387 | 7175 | 4843 | 4693 | 75420 |
| Produits Laitiers | 2442 | 2461 | 22540 | 2818 | 11566 | 4597 | 2389 | 2281 | 51094 |
| Sucreries | 15616 | 15715 | 51257 | 17394 | 30402 | 19326 | 16421 | 16335 | 182466 |
| Plats cuisinés | 1306 | 1345 | 12791 | 1439 | 8328 | 3684 | 1306 | 1231 | 31430 |
| Collation salée | 1016 | 1021 | 9914 | 1184 | 6144 | 2051 | 1023 | 973 | 23326 |
| Fruits et légumes | 4033 | 4175 | 11896 | 4687 | 8820 | 5276 | 3995 | 3951 | 46833 |
| Matières grasses et sauces | 2378 | 2375 | 15090 | 2828 | 9911 | 2392 | 2320 | 2284 | 39578 |
| Boissons | 2680 | 3050 | 14729 | 3492 | 5901 | 3462 | 2792 | 2902 | 39008 |
| Cat. Inconnue | 77812 | 77593 | 287189 | 99286 | 294754 | 80770 | 80450 | 79412 | 1077266 |
| Assaisonnements | 469 | 468 | 880 | 475 | 1021 | 471 | 464 | 464 | 4712 |
for i,row in df_VM.iterrows():
df_VM.loc[i] = (df_VM.loc[i]/df_VM.loc[i,'TOTAL'])*100
df_VM
| SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | TOTAL | |
|---|---|---|---|---|---|---|---|---|---|
| Céréales et pomme de terre | 5.622640 | 5.798181 | 32.589499 | 7.955210 | 27.950955 | 9.335603 | 5.460397 | 5.287515 | 100 |
| Viandes | 6.514187 | 6.461151 | 36.451870 | 6.687881 | 21.727658 | 9.513392 | 6.421374 | 6.222487 | 100 |
| Produits Laitiers | 4.779426 | 4.816613 | 44.114769 | 5.515325 | 22.636709 | 8.997143 | 4.675696 | 4.464321 | 100 |
| Sucreries | 8.558307 | 8.612563 | 28.091261 | 9.532735 | 16.661734 | 10.591562 | 8.999485 | 8.952353 | 100 |
| Plats cuisinés | 4.155266 | 4.279351 | 40.696787 | 4.578428 | 26.496977 | 11.721285 | 4.155266 | 3.916640 | 100 |
| Collation salée | 4.355655 | 4.377090 | 42.501929 | 5.075881 | 26.339707 | 8.792763 | 4.385664 | 4.171311 | 100 |
| Fruits et légumes | 8.611449 | 8.914654 | 25.400893 | 10.007900 | 18.832874 | 11.265561 | 8.530310 | 8.436359 | 100 |
| Matières grasses et sauces | 6.008388 | 6.000809 | 38.127242 | 7.145384 | 25.041690 | 6.043762 | 5.861842 | 5.770883 | 100 |
| Boissons | 6.870386 | 7.818909 | 37.758921 | 8.952010 | 15.127666 | 8.875103 | 7.157506 | 7.439500 | 100 |
| Cat. Inconnue | 7.223100 | 7.202771 | 26.659061 | 9.216479 | 27.361302 | 7.497684 | 7.467979 | 7.371624 | 100 |
| Assaisonnements | 9.953311 | 9.932088 | 18.675722 | 10.080645 | 21.668081 | 9.995756 | 9.847199 | 9.847199 | 100 |
fig, ax = plt.subplots(figsize=(12,8), dpi= 150)
sns.heatmap(data = df_VM[var_quant_bis],annot=True, cmap="coolwarm", fmt="0.2f")
plt.savefig('P3_42a.png',bbox_inches = 'tight')
Remarques :
#df['FIBER_100G'] = df['FIBER_100G'].fillna(0)
df.loc[(df['PNNS_GROUPS_1']!='Fruits et légumes') &
(df['PNNS_GROUPS_1']!='Plats cuisinés') &
(df['PNNS_GROUPS_1']!='Céréales') &
(df['PNNS_GROUPS_1']!='Cat. Inconnue')
,'FIBER_100G'] = df.loc[(df['PNNS_GROUPS_1']!='Fruits et légumes') &
(df['PNNS_GROUPS_1']!='Plats cuisinés') &
(df['PNNS_GROUPS_1']!='Céréales') &
(df['PNNS_GROUPS_1']!='Cat. Inconnue')
,'FIBER_100G'].fillna(0)
Passage de 72% de valeur manquantes pour les fibres à 49%
df.isna().mean()
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.000000 INGREDIENTS_TEXT 0.621465 SUGARS_100G 0.176430 SATURATED-FAT_100G 0.177163 FIBER_100G 0.479393 SALT_100G 0.217716 ADDITIVES_N 0.620605 ENERGY-KCAL_100G 0.203998 CARBOHYDRATES_100G 0.181467 PROTEINS_100G 0.179097 NOVA_GROUP 0.665526 NUTRISCORE_GRADE 0.586254 LEN_INGREDIENTS 0.000078 dtype: float64
Pour la variable SALT_100G, la distribution des valeurs est très peu dispersé autour de la médiane. Je choisie donc de remplacer les valeurs manquantes par les valeurs médianes.
print(df['SALT_100G'].describe())
med = df['SALT_100G'].describe()['50%']
df['SALT_100G'] = df['SALT_100G'].fillna(med)
count 508926.000000 mean 1.338585 std 5.376102 min 0.000000 25% 0.050000 50% 0.500000 75% 1.275000 max 100.000000 Name: SALT_100G, dtype: float64
Je choisie de garder les produits qui possèdent au moins deux informations nutritionnelles pour pouvoir obtenir des réusltats convenables à l'issu des imputations.
var_quant_bis
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'ADDITIVES_N', 'ENERGY-KCAL_100G', 'CARBOHYDRATES_100G',
'PROTEINS_100G'],
dtype='object')
df['NB_NAN']=df.loc[:,var_quant_bis].isna().sum(axis=1)
df = df[df['NB_NAN'] < 7]
df.shape
(578288, 17)
df = df.drop(columns = 'LEN_INGREDIENTS')
corr = df[var_quant_bis].corr()
corr
| SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | |
|---|---|---|---|---|---|---|---|---|
| SUGARS_100G | 1.000000 | 0.067189 | -0.015776 | -0.089591 | 0.116091 | 0.316501 | 0.690313 | -0.248019 |
| SATURATED-FAT_100G | 0.067189 | 1.000000 | -0.038204 | -0.016419 | -0.005727 | 0.570050 | -0.031493 | 0.134164 |
| FIBER_100G | -0.015776 | -0.038204 | 1.000000 | -0.011249 | -0.071028 | 0.117219 | 0.138123 | 0.134693 |
| SALT_100G | -0.089591 | -0.016419 | -0.011249 | 1.000000 | -0.005123 | -0.056769 | -0.086287 | 0.040917 |
| ADDITIVES_N | 0.116091 | -0.005727 | -0.071028 | -0.005123 | 1.000000 | 0.027624 | 0.118393 | -0.066092 |
| ENERGY-KCAL_100G | 0.316501 | 0.570050 | 0.117219 | -0.056769 | 0.027624 | 1.000000 | 0.459268 | 0.219952 |
| CARBOHYDRATES_100G | 0.690313 | -0.031493 | 0.138123 | -0.086287 | 0.118393 | 0.459268 | 1.000000 | -0.164572 |
| PROTEINS_100G | -0.248019 | 0.134164 | 0.134693 | 0.040917 | -0.066092 | 0.219952 | -0.164572 | 1.000000 |
g = sns.PairGrid(df[var_quant_bis])
g.map(sns.scatterplot)
plt.savefig('P3_422a.png', dpi = 200)
df_G = df[['SUGARS_100G','CARBOHYDRATES_100G', 'PNNS_GROUPS_1','NOVA_GROUP',
'NUTRISCORE_GRADE']]
df_G = df_G[df_G['PNNS_GROUPS_1']!='Cat. Inconnue']
pnns = df_G['PNNS_GROUPS_1'].values
#print(pnns)
#A = df_G.sample(200)
#fig, ax = plt.subplots(figsize = (12,9),dpi=200)
sns.relplot(data = df_G, x = 'CARBOHYDRATES_100G', y = 'SUGARS_100G',
hue = 'PNNS_GROUPS_1',
height=20, aspect=11.7/8.27 )
plt.savefig('P3_422b.png', dpi = 200)
df.to_csv('/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_export2.csv',index=False)
L'algorithme de l'iterative imputer est pertinenet lorsque plusieurs variables quantitatives sont corrélées. Or seul les sucres simples et glucides sont corréleés avec un coefficient de Pearson de 0.68. J'applique donc l'algorithme IImp aux variables SUGARS_100G et CARBOHYDRATES_100G.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
df = pd.read_csv('df_export2.csv',
dtype={'NOVA_GROUP': 'category',
'NUTRISCORE_GRADE': 'category',
'CODE':'object',
'PNNS_GROUPS_1':'category'})
df.head()
| CODE | URL | PRODUCT_NAME | PNNS_GROUPS_1 | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | NB_NAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8003007816546 | https://world.openfoodfacts.org/product/800300... | TUBETTINI INTEGRALI | Céréales et pomme de terre | SEMOULE COMPLÈTE DE BLE DUR ET EAU | 6.8 | 0.10 | 6.7 | 0.02 | 0.0 | 362.0 | 75.0 | 15.0 | 1 | a | 0 |
| 1 | 3258563461548 | https://world.openfoodfacts.org/product/325856... | TRIPES À LA | Viandes | ESTOMACS ET PIEDS DE BŒUF 77%, FOND DE SAUCE :... | 0.5 | 1.50 | 1.0 | 1.10 | 0.0 | 109.0 | 0.5 | 20.0 | 3 | b | 0 |
| 2 | 3760017370011 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.5 | 4.0 | 2.5 | 1 | a | 0 |
| 3 | 3760017370042 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO 100% | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.0 | 4.0 | 2.5 | 1 | a | 0 |
| 4 | 3258563461265 | https://world.openfoodfacts.org/product/325856... | MORTADELLE | Viandes | VIANDE DE PORC, SEL, ÉPICES, ARÔMES NATURELS, ... | 0.0 | 9.40 | 0.0 | 1.90 | 2.0 | 285.0 | 0.0 | 15.0 | 4 | e | 0 |
df.isna().mean()
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.000000 INGREDIENTS_TEXT 0.574155 SUGARS_100G 0.073498 SATURATED-FAT_100G 0.074323 FIBER_100G 0.414326 SALT_100G 0.000000 ADDITIVES_N 0.573187 ENERGY-KCAL_100G 0.104512 CARBOHYDRATES_100G 0.079165 PROTEINS_100G 0.076498 NOVA_GROUP 0.623879 NUTRISCORE_GRADE 0.534550 NB_NAN 0.000000 dtype: float64
var_quant = df.dtypes[df.dtypes == float].index
print(var_quant)
#var_quant_bis = var_quant.drop(['NB_NAN'])
#var_quant_100 = var_quant.drop(['ENERGY-KCAL_100G'])
var_glucide = ['SUGARS_100G','CARBOHYDRATES_100G']
var_glucide
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'ADDITIVES_N', 'ENERGY-KCAL_100G', 'CARBOHYDRATES_100G',
'PROTEINS_100G'],
dtype='object')
['SUGARS_100G', 'CARBOHYDRATES_100G']
X = df[var_glucide].values
X
Name = df[var_glucide].columns
Name, X
(Index(['SUGARS_100G', 'CARBOHYDRATES_100G'], dtype='object'),
array([[ 6.8, 75. ],
[ 0.5, 0.5],
[ 1.6, 4. ],
...,
[70. , nan],
[ nan, nan],
[ nan, nan]]))
min_value = np.zeros(1)
max_value = np.array([100,100])
imp_mean = IterativeImputer(random_state=0,
initial_strategy= 'median',
min_value = np.zeros(2),
max_value = np.array([100,100]))
X = imp_mean.fit_transform(X)
df[var_glucide] = X
#Vérification
#OK
df.isna().mean()
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.000000 INGREDIENTS_TEXT 0.574155 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.074323 FIBER_100G 0.414326 SALT_100G 0.000000 ADDITIVES_N 0.573187 ENERGY-KCAL_100G 0.104512 CARBOHYDRATES_100G 0.000000 PROTEINS_100G 0.076498 NOVA_GROUP 0.623879 NUTRISCORE_GRADE 0.534550 NB_NAN 0.000000 dtype: float64
df.to_csv('/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_clean_fIImp.csv',index=False)
Pour le reste des variables nutritives j'utilise l'algorithme du K-NN.
var_quant = var_quant.drop(var_glucide)
var_quant
Index(['SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G', 'ADDITIVES_N',
'ENERGY-KCAL_100G', 'PROTEINS_100G'],
dtype='object')
A = df.sample(200)
A[A['NB_NAN']>0]
| CODE | URL | PRODUCT_NAME | PNNS_GROUPS_1 | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | NB_NAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 403508 | 5060119298267 | https://world.openfoodfacts.org/product/506011... | SALTED CARAMEL | Cat. Inconnue | NaN | 3.000000 | 6.00 | NaN | 1.55 | NaN | 388.0 | 12.00 | 62.00 | NaN | NaN | 2 |
| 208737 | 3701091701018 | https://world.openfoodfacts.org/product/370109... | FARINE BIO EPEAUTRE | Céréales et pomme de terre | NaN | 1.800000 | 0.30 | 0.0 | 0.01 | NaN | 329.0 | 67.00 | 11.00 | NaN | a | 1 |
| 336688 | 8436542192798 | https://world.openfoodfacts.org/product/843654... | SIROPE DE COCO | Boissons | NaN | 76.500000 | 0.00 | 0.0 | 0.00 | NaN | 312.0 | 78.00 | 0.00 | NaN | NaN | 1 |
| 379536 | 3596710456611 | https://world.openfoodfacts.org/product/359671... | PANACHÉ RAFRAÎCHISSANT | Cat. Inconnue | NaN | 6.800000 | 0.10 | NaN | 0.01 | NaN | 41.0 | 8.00 | 0.50 | NaN | NaN | 2 |
| 402071 | 6009879777017 | https://world.openfoodfacts.org/product/600987... | OHMEGA MACADAMIA BUTTER | Cat. Inconnue | MACADAMIA NUTS | 4.100000 | 11.90 | NaN | 0.50 | 0.0 | 641.0 | 5.40 | 7.80 | NaN | NaN | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 219979 | 3423183300304 | https://world.openfoodfacts.org/product/342318... | CONFITURE DE MYRTILLE | Sucreries | NaN | 57.000000 | 0.00 | 0.0 | 0.00 | NaN | 250.0 | 60.00 | 0.60 | NaN | c | 1 |
| 424982 | 8057737591488 | https://world.openfoodfacts.org/product/805773... | GNOCCHI RIGATI DIGPATATE | Cat. Inconnue | NaN | 4.700000 | 0.20 | NaN | 1.00 | NaN | 129.0 | 25.00 | 3.60 | NaN | NaN | 2 |
| 393955 | 3665018045213 | https://world.openfoodfacts.org/product/366501... | CRÈME GLACÉE NOUGAT | Cat. Inconnue | NaN | 21.450001 | 0.28 | NaN | 0.05 | NaN | 163.0 | 27.35 | 1.61 | NaN | NaN | 2 |
| 422325 | 2609906037653 | https://world.openfoodfacts.org/product/260990... | MELANGE APERETIF SALE | Cat. Inconnue | NaN | 5.900000 | 8.80 | NaN | 1.00 | NaN | 636.0 | 8.40 | 24.00 | NaN | NaN | 2 |
| 283475 | 3343001111397 | https://world.openfoodfacts.org/product/334300... | NOUGAT | Sucreries | NaN | 41.700000 | 1.30 | 0.0 | 0.03 | NaN | 461.0 | 66.50 | 9.00 | NaN | d | 1 |
142 rows × 16 columns
df.head()
| CODE | URL | PRODUCT_NAME | PNNS_GROUPS_1 | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | NB_NAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8003007816546 | https://world.openfoodfacts.org/product/800300... | TUBETTINI INTEGRALI | Céréales et pomme de terre | SEMOULE COMPLÈTE DE BLE DUR ET EAU | 6.8 | 0.10 | 6.7 | 0.02 | 0.0 | 362.0 | 75.0 | 15.0 | 1 | a | 0 |
| 1 | 3258563461548 | https://world.openfoodfacts.org/product/325856... | TRIPES À LA | Viandes | ESTOMACS ET PIEDS DE BŒUF 77%, FOND DE SAUCE :... | 0.5 | 1.50 | 1.0 | 1.10 | 0.0 | 109.0 | 0.5 | 20.0 | 3 | b | 0 |
| 2 | 3760017370011 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.5 | 4.0 | 2.5 | 1 | a | 0 |
| 3 | 3760017370042 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO 100% | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.0 | 4.0 | 2.5 | 1 | a | 0 |
| 4 | 3258563461265 | https://world.openfoodfacts.org/product/325856... | MORTADELLE | Viandes | VIANDE DE PORC, SEL, ÉPICES, ARÔMES NATURELS, ... | 0.0 | 9.40 | 0.0 | 1.90 | 2.0 | 285.0 | 0.0 | 15.0 | 4 | e | 0 |
from sklearn.impute import KNNImputer
#Création des échantillons aléatoires
df_shuffled = df.sample(frac=1, random_state = 0)
result = np.array_split(df_shuffled,60)
imputer = KNNImputer(n_neighbors=5, weights = 'distance')
#iris = sns.load_dataset("iris")
#iris = iris.reset_index()
#Par chunk
df_shuffled = df.sample(frac=1, random_state = 0)
result = np.array_split(df_shuffled,60)
df_p = pd.DataFrame()
for part in result:
X = part[var_quant].values
X = imputer.fit_transform(X)
part[var_quant] = X
df_p = pd.concat([df_p,part])
df_p = df_p.sort_index()
df_p.head()
| CODE | URL | PRODUCT_NAME | PNNS_GROUPS_1 | INGREDIENTS_TEXT | SUGARS_100G | SATURATED-FAT_100G | FIBER_100G | SALT_100G | ADDITIVES_N | ENERGY-KCAL_100G | CARBOHYDRATES_100G | PROTEINS_100G | NOVA_GROUP | NUTRISCORE_GRADE | NB_NAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8003007816546 | https://world.openfoodfacts.org/product/800300... | TUBETTINI INTEGRALI | Céréales et pomme de terre | SEMOULE COMPLÈTE DE BLE DUR ET EAU | 6.8 | 0.10 | 6.7 | 0.02 | 0.0 | 362.0 | 75.0 | 15.0 | 1 | a | 0 |
| 1 | 3258563461548 | https://world.openfoodfacts.org/product/325856... | TRIPES À LA | Viandes | ESTOMACS ET PIEDS DE BŒUF 77%, FOND DE SAUCE :... | 0.5 | 1.50 | 1.0 | 1.10 | 0.0 | 109.0 | 0.5 | 20.0 | 3 | b | 0 |
| 2 | 3760017370011 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.5 | 4.0 | 2.5 | 1 | a | 0 |
| 3 | 3760017370042 | https://world.openfoodfacts.org/product/376001... | POUSSES DE HARICOT | Céréales et pomme de terre | POUSSES DE HARICOT MUNGO 100% | 1.6 | 0.17 | 1.8 | 0.02 | 0.0 | 22.0 | 4.0 | 2.5 | 1 | a | 0 |
| 4 | 3258563461265 | https://world.openfoodfacts.org/product/325856... | MORTADELLE | Viandes | VIANDE DE PORC, SEL, ÉPICES, ARÔMES NATURELS, ... | 0.0 | 9.40 | 0.0 | 1.90 | 2.0 | 285.0 | 0.0 | 15.0 | 4 | e | 0 |
df = df_p
df.to_csv('/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_clean_fKnn.csv',index=False)
df.isna().mean()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [1], in <cell line: 1>() ----> 1 df.isna().mean() NameError: name 'df' is not defined
maxadd = df['ADDITIVES_N'].max()
bins = pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 5),(5, maxadd)])
bins
df['ADDITIVES_N'] = pd.cut(df['ADDITIVES_N'],bins,labels = ['Sans additifs', '1 à 5','5+'])
mapping = {bins[0] : 'Sans additifs',
bins[1]:'1 à 5',
bins[2] :'5+'}
df['ADDITIVES_N'] = df['ADDITIVES_N'].replace(mapping)
df['ADDITIVES_N'] = df['ADDITIVES_N'].astype('category')
Stratégie : Utiliser l'algorithme des K-NN pour traiter les valeurs manquantes des variables :
Utilise
PNNS_GROUPS_1
var_quant = var_quant.str.upper()
var_quant
Index(['SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G', 'ADDITIVES_N',
'ENERGY-KCAL_100G', 'PROTEINS_100G'],
dtype='object')
var_quant_100 = var_quant_100.str.upper()
var_quant_100
Index(['SUGARS_100G', 'SATURATED-FAT_100G', 'FIBER_100G', 'SALT_100G',
'CARBOHYDRATES_100G', 'PROTEINS_100G'],
dtype='object')
#A = df.sample(128000)
df.isna().mean()
CODE 0.000000 URL 0.000000 PRODUCT_NAME 0.000000 PNNS_GROUPS_1 0.000000 INGREDIENTS_TEXT 0.574155 SUGARS_100G 0.000000 SATURATED-FAT_100G 0.000000 FIBER_100G 0.000000 SALT_100G 0.000000 ADDITIVES_N 0.000000 ENERGY-KCAL_100G 0.000000 CARBOHYDRATES_100G 0.000000 PROTEINS_100G 0.000000 NOVA_GROUP 0.623879 NUTRISCORE_GRADE 0.534550 NB_NAN 0.000000 dtype: float64
df.PNNS_GROUPS_1.unique()
['Céréales et pomme de terre', 'Viandes', 'Produits Laitiers', 'Sucreries', 'Plats cuisinés', ..., 'Fruits et légumes', 'Matières grasses et sauces', 'Boissons', 'Cat. Inconnue', 'Assaisonnements'] Length: 11 Categories (11, object): ['Boissons', 'Collation salée', 'Céréales et pomme de terre', 'Fruits et légumes', ..., 'Sucreries', 'Viandes', 'Cat. Inconnue', 'Assaisonnements']